

// Move into the replication package folder; the data paths used below
// ("data/raw/...", "data/state/...") are written relative to it.
cd "../replication-package"

/*
   This do-file cleans the control varaibles for the sexism paper. 
   Generating the version with ASEC to be consistent for our data source. 
   
   1. women's early marriage
   2. women's level of education (% BA)
   3. average level of education (% BA)
   3. median hourly wage by state 
   4. % of immigrants
   5. % non-white 
   
  
*/

// Load the CPS extract and restrict it to the analysis sample.
use "data/raw/cps_00088.dta", clear
tab asecflag // inspection only: confirm the extract holds ASEC records
keep if year >= 1983 & year <= 2021 // survey years; earnings refer to 'last year'
keep if age >= 18 & age <= 65

// Attach the year-level top-coding thresholds and keep only matched records.
merge m:1 year using "data/ipums_topcoding.dta", gen(m_topcode)
tab year m_topcode // author's note: only pre-1983 not matched
drop if m_topcode != 3
drop m_topcode

// State identifier carrying two-letter postal abbreviations as value labels.
// The numeric codes are copied from statefip; the label map is listed in code order.
gen state_a = statefip
label define state_a ///
			1 "AL" 2 "AK" 4 "AZ" 5 "AR" 6 "CA" 8 "CO" 9 "CT" 10 "DE" 11 "DC" 12 "FL" ///
			13 "GA" 15 "HI" 16 "ID" 17 "IL" 18 "IN" 19 "IA" 20 "KS" 21 "KY" 22 "LA" ///
			23 "ME" 24 "MD" 25 "MA" 26 "MI" 27 "MN" 28 "MS" 29 "MO" 30 "MT" 31 "NE" ///
			32 "NV" 33 "NH" 34 "NJ" 35 "NM" 36 "NY" 37 "NC" 38 "ND" 39 "OH" 40 "OK" ///
			41 "OR" 42 "PA" 44 "RI" 45 "SC" 46 "SD" 47 "TN" 48 "TX" 49 "UT" 50 "VT" ///
			51 "VA" 53 "WA" 54 "WV" 55 "WI" 56 "WY", replace
label values state_a state_a
label variable state_a "Abbreviated State Names"
order state_a, after(statefip)

** calculate hourly wages first, and save 
*** 1. Top-coding & CPI adjustment (universal to all specifications)
	// Working copies of the income variables with the NIU code (99999999) set to missing.
	recode incwage  (99999999=.), gen(tc_incwage)
	recode inclongj (99999999=.), gen(tc_inclongj)
	recode oincwage (99999999=.), gen(tc_oincwage)

	// Through 1987: multiply wage income at or above the year's threshold by 1.5.
	// 'incwage_top' comes from the merged top-coding file.
	replace tc_incwage = 1.5*tc_incwage if year<=1987 & tc_incwage!=. & tc_incwage>=incwage_top
	//sum tc_incwage if year<=1987

	// 1988 and onward, 'incwage' is the sum of inclongj and oincwage, so each
	// component is adjusted against its own threshold and the total is rebuilt.
	// https://cps.ipums.org/cps-action/variables/INCWAGE#description_section
	replace tc_inclongj = 1.5*tc_inclongj if year>=1988 & tc_inclongj!=. & tc_inclongj>=inclongj_top
	replace tc_oincwage = 1.5*tc_oincwage if year>=1988 & tc_oincwage!=. & tc_oincwage>=oincwage_top
	replace tc_incwage = tc_inclongj + tc_oincwage if year>=1988

	sum tc_*

	*** Calculate hourly earnings 
	// Weeks worked last year, by source of earnings.
	tab wkswork1 srcearn, m
		//: note that weeks worked include non-wage working 

	// Usual weekly hours last year; code 999 is treated as missing.
	replace uhrsworkly = . if uhrsworkly==999
	bys srcearn: sum uhrsworkly if inrange(srcearn, 1, 4)
		//: note that usual hours worked include non-wage working 

	// Hourly earnings = annual wage income / (weeks worked * usual weekly hours).
	gen hrly_earn = tc_incwage / (wkswork1*uhrsworkly)
	sum hrly_earn

	// CPI adjustment: to 1999 USD via cpi99, then to 2019 USD (not 2020 USD) via 1.535.
	gen hrly_earn99 = hrly_earn*cpi99
	gen hrly_earn19 = hrly_earn99*1.535
	sum hrly_earn*

	// Re-key records to the year the earnings were actually obtained
	// (survey year minus one); the new variable replaces 'year'.
	gen earn_year = year - 1
	drop year
	rename earn_year year

	*** 2. Specifications 
	// Builds three hourly-earnings measures from hrly_earn19 (2019 USD):
	//   hrearn_v1: everyone
	//   hrearn_v2: wage/salary earners only (from 1987 on)
	//   hrearn_v3: v2 with extreme values censored and zero earners dropped
	tab year srcearn // only available from 1987 

	// 1) all hrly earnings 
	gen hrearn_v1 = hrly_earn19 
	label var hrearn_v1 "Hourly earnings in 2019 USD, ALL"

	// 2) only include wage/salary 
	// (source of income from the longest job was wage/salary)
	gen hrearn_v2 = hrly_earn19
	fre srcearn
	replace hrearn_v2 = . if srcearn!=1 & year>=1987
	label var hrearn_v2 "Hourly earnings in 2019 USD, WAGE/SALARY ONLY"

	// 3) only include wage/salary + extreme values adjusted 
	// values below $1 are set to $1 and values above $100 to $100, in 1979 USD 
	// following Cha and Weeden 2014
	// also dropping zero earners 
	gen hrearn_v3 = hrearn_v2
	// 2019 USD -> 1999 USD -> 1979 USD. The deflation divides by exactly the
	// factors (1.535 and 2.295) that the re-inflation below multiplies by, so
	// values that are not censored come back unchanged. The previous constants
	// (0.644*0.436) were not the inverse of 2.295*1.535 and scaled every
	// uncensored value by about 0.989.
	gen hrearn_1979 = hrearn_v3/(1.535*2.295)
		//: https://cps.ipums.org/cps/cpi99.shtml
	replace hrearn_1979 = 1 if hrearn_1979<=1 & hrearn_1979>0 
	replace hrearn_1979 = 100 if hrearn_1979>=100 & hrearn_1979!=. 
	replace hrearn_v3 = hrearn_1979*2.295*1.535 
		//: convert first back to 1999 USD, and then 2019 USD 
	replace hrearn_v3 = . if srcearn!=1 & year>=1987
	replace hrearn_v3 = . if hrearn_v3==0 
	sum hrearn_v3, det 

	label var hrearn_v3 "Hourly earnings in 2019 USD, WAGE/SALARY ONLY & TRUNCATED"

	// all measures 
	sum hrearn_v*

* Weighted state-by-year median of hourly earnings (level and log), saved to disk.
* The log is taken of (hrearn_v3 + 0.1).
// NOTE(review): 'year' was already converted to the earnings year earlier in
// this file (survey year - 1). The step below subtracts one more year, only
// inside preserve/restore, so only the saved file is affected. Confirm that
// this extra one-year lag is intended.
gen lnhrearn_v3 = log(hrearn_v3 + 0.1)

preserve
	replace year = year - 1
	collapse (p50) median_cps = hrearn_v3 lnmedian_cps = lnhrearn_v3 ///
			 [pw=asecwt], by(state_a year)
	save "data/state/07_medwage_ASEC.dta", replace
restore


**** --------------------------------- ****
**## Wage returns to longer work hours ##**
**** --------------------------------- ****

// Approach follows Ishizuka and Musick 2021, Demography 58(4),
// doi:10.1215/00703370-9373598 (see the article's online appendix).
// Their model: ln(annual earnings) regressed on ln(annual weeks worked) + ln(weekly work hours)
// + occupation + occupation*ln(weekly work hours), with controls for gender,
// educational attainment (five levels), race/ethnicity, and age and age squared.
// They calculated occupation-specific returns to work hours; in this paper we
// estimate state-specific returns to work hours instead.
gen ln_annual  = ln(tc_incwage)
gen ln_wkswork = ln(wkswork1)
gen ln_hrswork = ln(uhrsworkly)

// Years of schooling (edyrs) from the IPUMS CPS EDUC codes, then a five-level
// attainment category (edcat) used as a control in the regressions.
// EDUC code 2 is "None or preschool" and is mapped to 0 years; it was
// previously mapped to 13.5, which placed those respondents in "Some college".
// Codes not listed in the rules keep their original value in edyrs.
recode educ (2=0) (10=2.5) (11=1) (12=2) (13=3) (14=4) ///
			(20=5.5) (21=5) (22=6) (30=7.5) (31=7) (32=8) (40=9) (50=10) (60=11) ///
			(70/71=11.5) (72/73=12) (80=13) (81=13.5) (90/92=15) ///
			(100=15) (110=15.5) (111/122=16) (123=18) (124/125=20), gen(edyrs)
recode edyrs (0/11.5=1 "Less than HS") (12=2 " HS grad") (13/15.5=3 "Some college") (16=4 "Bachelor's") (17/20=5 "Advanced"), gen(edcat)

// Race/ethnicity indicators and a combined 4-category variable (reth).
// hispan codes 0, 901, 902 (not Hispanic / missing or dk) count as non-Hispanic;
// hispan codes 100-612 count as Hispanic. The four indicators cannot overlap,
// so reth is built as their weighted sum and stays 0 when none applies.
label define racelb 1 "NH White" 2 "NH Black" 3 "Hispanic" 4 "NH Other"

gen nhwht = race==100 & inlist(hispan, 0, 901, 902) // 100=white
label variable nhwht "Non-Hispanic White dummy"

gen nhblk = race==200 & inlist(hispan, 0, 901, 902) // 200=black
label variable nhblk "Non-Hispanic Black dummy"

gen hispd = inrange(hispan, 100, 612)
label variable hispd "Hispanic dummy"

gen nhoth = race>=300 & inlist(hispan, 0, 901, 902)
label variable nhoth "Non-Hispanic other race dummy"

gen reth = nhwht + 2*nhblk + 3*hispd + 4*nhoth
label values reth racelb

** check_agerange 
sum age 

// For each earnings year, regress ln(annual earnings) on ln(weeks), ln(hours),
// state, and ln(hours) x state plus controls, then store each state's return
// to work hours: the ln_hrswork main effect for the base state (code 1) and
// main effect + interaction for every other state.
// Each year's results go to the tempfile `res_year`y'' with variables
// wrkhr_return, state_num and year (one row per state).
//
// Results are written with postfile rather than a preserve/clear/svmat/save/
// restore cycle per state and year, which avoids copying the full microdata
// thousands of times. The state list does not depend on the year, so it is
// computed once before the loop.
// limiting to 1982+ because in some years, not all states are collected 
qui levelsof state_a, local(state_list)
tempname post_h

forval y = 1982/2020 {
	di "									"
	di " 		Result for year `y'		"
	di "									"
	qui reg ln_annual c.ln_wkswork c.ln_hrswork##i.state_a ///
					  i.sex ib2.edcat ib1.reth c.age##c.age if year==`y'
	est store m_state

	tempfile res_year`y'
	postfile `post_h' float wrkhr_return float state_num float year using `res_year`y''
	foreach st of local state_list {
		if `st'==1 {
			// base state: no interaction term
			post `post_h' (_b[ln_hrswork]) (`st') (`y')
		}
		else {
			post `post_h' (_b[ln_hrswork] + _b[`st'.state_a#c.ln_hrswork]) (`st') (`y')
		}
	}
	postclose `post_h'
}

// Stack the per-year result files into one state-by-year dataset.
use `res_year1982', clear
forvalues yr = 1983/2020 {
	append using `res_year`yr''
}

// Labelled state identifier (codes copied from state_num; map listed in code order).
gen state_a = state_num
label define state_a ///
		1 "AL" 2 "AK" 4 "AZ" 5 "AR" 6 "CA" 8 "CO" 9 "CT" 10 "DE" 11 "DC" 12 "FL" ///
		13 "GA" 15 "HI" 16 "ID" 17 "IL" 18 "IN" 19 "IA" 20 "KS" 21 "KY" 22 "LA" ///
		23 "ME" 24 "MD" 25 "MA" 26 "MI" 27 "MN" 28 "MS" 29 "MO" 30 "MT" 31 "NE" ///
		32 "NV" 33 "NH" 34 "NJ" 35 "NM" 36 "NY" 37 "NC" 38 "ND" 39 "OH" 40 "OK" ///
		41 "OR" 42 "PA" 44 "RI" 45 "SC" 46 "SD" 47 "TN" 48 "TX" 49 "UT" 50 "VT" ///
		51 "VA" 53 "WA" 54 "WV" 55 "WI" 56 "WY", replace
label values state_a state_a

// Comparison with Ishizuka and Musick 2021 over 2004-2013.
sum wrkhr_return if inrange(year, 2004, 2013)
//: author's recorded result: mean 0.93 (median 0.94); std.dev 0.096 
//: I & M had mean of 0.83 and std.dev of 0.19 

save "data/state/07_wrkhr_return_ASEC.dta", replace



**** ------------------ ****
**## gender poverty gap ##**
**** ------------------ ****

// Poverty status comes from the IPUMS-constructed POVERTY variable.
// IPUMS USA describes POVERTY as a 3-digit code expressing each family's total
// income for the previous year as a percentage of the poverty thresholds
// established by the Social Security Administration in 1964 and revised in 1980,
// adjusted for inflation (see the Poverty Definition Page):
// https://usa.ipums.org/usa-action/variables/POVERTY#codes_section
//
// Codes in Census/ACS:
//   000 = N/A
//   001 = 1 percent or less of poverty threshold (including 0 or negative income)
//   501 = 501 percent or more of poverty threshold
// Codes in ASEC (the ones used below):
//   10 = below poverty
//   21 = 100-124% of the low-income level
//   22 = 125-149% of the low-income level
//   23 = 150+% of the low-income level
clear all
use "data/raw/cps_00127.dta"
keep if age >= 18 & age <= 65

// 1 = below poverty (code 10); 0 = codes 21/22/23; missing for anything else.
gen poverty_dum = cond(poverty==10, 1, cond(inlist(poverty, 21, 22, 23), 0, .))

tab year poverty_dum, row m

// Weighted poverty rate by state and year, computed separately for sex==1 and
// sex==2 and stored in one tempfile each.
foreach g in 1 2 {
	preserve
	keep if sex == `g'
	collapse (mean) povdum_`g' = poverty_dum [pw=asecwt], by(statefip year)
	tempfile poverty`g'
	save `poverty`g''
	restore
}

// Put the two rates side by side and form the women-to-men ratio.
use `poverty1', clear
merge 1:1 statefip year using `poverty2', nogen
rename (povdum_1 povdum_2) (poverty_m poverty_w)
gen poverty_gap = poverty_w / poverty_m
label variable poverty_gap "Women-to-men poverty status gap"

// Replace statefip with a labelled state identifier (same numeric codes,
// two-letter abbreviations as value labels), then save the state-year file.
gen state_a = statefip
label define state_a ///
		1 "AL" 2 "AK" 4 "AZ" 5 "AR" 6 "CA" 8 "CO" 9 "CT" 10 "DE" 11 "DC" 12 "FL" ///
		13 "GA" 15 "HI" 16 "ID" 17 "IL" 18 "IN" 19 "IA" 20 "KS" 21 "KY" 22 "LA" ///
		23 "ME" 24 "MD" 25 "MA" 26 "MI" 27 "MN" 28 "MS" 29 "MO" 30 "MT" 31 "NE" ///
		32 "NV" 33 "NH" 34 "NJ" 35 "NM" 36 "NY" 37 "NC" 38 "ND" 39 "OH" 40 "OK" ///
		41 "OR" 42 "PA" 44 "RI" 45 "SC" 46 "SD" 47 "TN" 48 "TX" 49 "UT" 50 "VT" ///
		51 "VA" 53 "WA" 54 "WV" 55 "WI" 56 "WY", replace
label values state_a state_a
drop statefip

save "data/state/07_gender_poverty_gap_ASEC.dta", replace


* ----------------------------------- *
* ----- Other control variables ----- *
* ----------------------------------- *
// Reload the CPS extract from disk and keep ages 18-65.
use "data/raw/cps_00088.dta", clear

keep if age >= 18 & age <= 65

// Labelled state identifier: codes copied from statefip, two-letter
// abbreviations as value labels (map listed in code order).
gen state_a = statefip
label define state_a ///
			1 "AL" 2 "AK" 4 "AZ" 5 "AR" 6 "CA" 8 "CO" 9 "CT" 10 "DE" 11 "DC" 12 "FL" ///
			13 "GA" 15 "HI" 16 "ID" 17 "IL" 18 "IN" 19 "IA" 20 "KS" 21 "KY" 22 "LA" ///
			23 "ME" 24 "MD" 25 "MA" 26 "MI" 27 "MN" 28 "MS" 29 "MO" 30 "MT" 31 "NE" ///
			32 "NV" 33 "NH" 34 "NJ" 35 "NM" 36 "NY" 37 "NC" 38 "ND" 39 "OH" 40 "OK" ///
			41 "OR" 42 "PA" 44 "RI" 45 "SC" 46 "SD" 47 "TN" 48 "TX" 49 "UT" 50 "VT" ///
			51 "VA" 53 "WA" 54 "WV" 55 "WI" 56 "WY", replace
label values state_a state_a
label variable state_a "Abbreviated State Names"
order state_a, after(statefip)

*----- 1. % of young female who are EVER married -----*
// age range: 18-24
// evermarr1824 is defined only for women aged 18-24: 1 when marst is 1-5,
// 0 otherwise; everyone else is missing.
gen fem = sex==2
gen yngfem = 1 if inrange(age, 18, 24) & fem==1
gen evermarr1824 = inrange(marst, 1, 5) if yngfem==1


*----- 2. % of women with BA -----*
// badum: educ codes 110-125 -> 1, codes 0-100 -> 0 (whole sample).
// fembadum: the same indicator restricted to women; missing for men.
recode educ (0/100=0) (110/125=1), gen(badum)
gen fembadum = (badum==1) if fem==1

*----- 3. % non-white -----*
// racecat: 1 NH White, 2 NH Black, 3 Hispanic, 4 NH Other; missing otherwise.
// hispan 0/901/902 counts as non-Hispanic, 100-612 as Hispanic, so the four
// conditions below cannot overlap.
gen racecat = 3 if inrange(hispan, 100, 612)                   // Hispanic
replace racecat = 1 if inlist(hispan, 0, 901, 902) & race==100 // NH Wh
replace racecat = 2 if inlist(hispan, 0, 901, 902) & race==200 // NH Bl
replace racecat = 4 if inlist(hispan, 0, 901, 902) & race>=300 // NH Oth
label define racelb 1 "NH Wh" 2 "NH Bl" 3 "Hispanic" 4 "NH Oth"
label values racecat racelb

// One 0/1 indicator per category, created in category order.
local k = 0
foreach stub in nhwhite nhblack hispan nhother {
	local ++k
	gen `stub'_d = racecat==`k'
}



*----- 4. collapsing -----*
// Weighted state-by-year means of the control indicators, saved to disk.
// NOTE(review): obs_cps is a (sum) of a constant 1 under pweights, i.e. the
// sum of asecwt rather than a count of rows; collapse offers (rawsum) for an
// unweighted count. Confirm which one is wanted.
gen n = 1

collapse (mean) femmar1824_cps = evermarr1824 ///
				badum_cps      = badum        ///
				fembadum_cps   = fembadum     ///
				nhblack_d_cps  = nhblack_d    ///
				hispan_d_cps   = hispan_d     ///
				nhother_d_cps  = nhother_d    ///
		 (sum)  obs_cps        = n            ///
		 [pw=asecwt], by(state_a year)

sort state_a year
save "data/state/07_controls_ASEC.dta", replace















